###########################################################
# Kerice Doten-Snitker
#
# Städtebund Data - prep and cleaning
# 
# Built under R 3.4.3 
# Platform: x86_64-apple-darwin15.6.0 (64-bit)
###########################################################

# # set locale to preempt UTF-8 issues with the mydata
# Sys.getlocale()
# # for Mac
# Sys.setlocale(category = "LC_ALL", locale = "en_US.UTF-8")
# #for windows
# #Sys.setlocale(category = "LC_ALL", locale = "English_United States.1252")
# 
# # add packages with the pacman library, which installs if not installed
# library(pacman)
# # use the here package for improving replicability
# p_load(here, haven)
# # for data manipulation
# p_load(magrittr, plyr, tidyverse, reshape2)
# 
# set.seed(1418)

###
# Observations are alliance treaties
# with their parties and purposes
###

# add the Städtebund data to the R environment 
mydata_s1 <- read_csv(here::here("in_data","Distler_Staedtebunde_cleaned.csv"))

# 1: Break out alliance parties ------------------------------------------

# from string column of parties to list-column

mydata_s1$PlaceName <- mydata_s1$Participants %>%
  str_replace_all("\\.\\s", "_") %>% # replace .\s in party names with _;
  str_replace_all("\\.", "_") %>% # . in party names with _;
  str_replace_all("\\b\\s", "_") %>% # and \s after a word boundary with _
  str_extract_all("[\\w|-]+\\b") # then extract items to make list col

# from list-column to year-party rows

mydata_s2 <- mydata_s1 %>%
  dplyr::select(Städtebund, Year, Participants, PlaceName) %>%
  unnest(PlaceName)

# 2: add Place IDs  ----------------------------------------------------

# get main data, then reduce to matrix of {PlaceName, PlaceID}
mydata_b <- read.csv(here::here("in_data","Haverkamp_Ortskatalog_limited_notext.csv"), 
  header=TRUE, encoding = "latin1", stringsAsFactors=FALSE)

ID_table <- mydata_b %>%
  dplyr::select(PlaceName, PlaceID) %>%
  distinct(PlaceID, .keep_all = TRUE)

# merge ID table with alliance data

mydata_s3 <- merge(mydata_s2, ID_table, by.x=c("PlaceName"), by.y=c("PlaceName"), all.x=FALSE, all.y=FALSE)

# 3: clean up for merging ------------------------------------------------

mydata_s3 <- mydata_s3 %>%
  group_by(PlaceName) %>%
  arrange(Year) %>%
  mutate(Bund = 1) %>% # all obs were party to Bund - set up for binary after merge
  ungroup() %>%
  dplyr::select(Year, PlaceID, PlaceName, Städtebund, Participants, Bund) %>%
  arrange(Year, PlaceID)

saveRDS(mydata_s3, here::here("in_data", "Distler_Staedtebunde_prepped.rds"))

# x: summary stats on Bunde  ---------------------------------------------

Bunde <- mydata_s3 %>%
  group_by(Städtebund, Year) %>%
  summarize(
    TotalMembers = n()
  )

# hist of alliance years
ggplot(data=Bunde, aes(Year, fill=Städtebund)) +
  geom_histogram(position="dodge", binwidth=20) +
  theme_classic()
# scatter of years and total of members
ggplot(data=Bunde, aes(Year, TotalMembers))+
  geom_point(position="jitter", aes(color=Städtebund, shape=Städtebund)) +    
  theme_classic()
# hist of alliance years
ggplot(data=mydata_s3, aes(Year, fill=Städtebund))+
  geom_histogram(position="dodge", binwidth=20) +
  theme_classic()

